xref: /dpdk/examples/vhost/main.c (revision b24ec9bc1c43f7cada0b16709043f84f52f2b895)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60 
61 /* Maximum long option length for option parsing. */
62 #define MAX_LONG_OPT_SZ 64
63 
64 /* mask of enabled ports */
65 static uint32_t enabled_port_mask = 0;
66 
67 /* Promiscuous mode */
68 static uint32_t promiscuous;
69 
70 /* number of devices/queues to support*/
71 static uint32_t num_queues = 0;
72 static uint32_t num_devices;
73 
74 static struct rte_mempool *mbuf_pool;
75 static int mergeable;
76 
77 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
78 typedef enum {
79 	VM2VM_DISABLED = 0,
80 	VM2VM_SOFTWARE = 1,
81 	VM2VM_HARDWARE = 2,
82 	VM2VM_LAST
83 } vm2vm_type;
84 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
85 
86 /* Enable stats. */
87 static uint32_t enable_stats = 0;
88 /* Enable retries on RX. */
89 static uint32_t enable_retry = 1;
90 
91 /* Disable TX checksum offload */
92 static uint32_t enable_tx_csum;
93 
94 /* Disable TSO offload */
95 static uint32_t enable_tso;
96 
97 static int client_mode;
98 static int dequeue_zero_copy;
99 
100 static int builtin_net_driver;
101 
102 /* Specify the timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106 
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110 
111 /* Empty vmdq configuration structure. Filled in programmatically. */
112 static struct rte_eth_conf vmdq_conf_default = {
113 	.rxmode = {
114 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115 		.split_hdr_size = 0,
116 		/*
117 		 * VLAN strip is necessary for 1G NICs such as the I350;
118 		 * it fixes a bug where IPv4 forwarding in the guest cannot
119 		 * forward packets from one virtio dev to another virtio dev.
120 		 */
121 		.offloads = (DEV_RX_OFFLOAD_CRC_STRIP |
122 			     DEV_RX_OFFLOAD_VLAN_STRIP),
123 	},
124 
125 	.txmode = {
126 		.mq_mode = ETH_MQ_TX_NONE,
127 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
128 			     DEV_TX_OFFLOAD_TCP_CKSUM |
129 			     DEV_TX_OFFLOAD_VLAN_INSERT |
130 			     DEV_TX_OFFLOAD_MULTI_SEGS |
131 			     DEV_TX_OFFLOAD_TCP_TSO),
132 	},
133 	.rx_adv_conf = {
134 		/*
135 		 * should be overridden separately in code with
136 		 * appropriate values
137 		 */
138 		.vmdq_rx_conf = {
139 			.nb_queue_pools = ETH_8_POOLS,
140 			.enable_default_pool = 0,
141 			.default_pool = 0,
142 			.nb_pool_maps = 0,
143 			.pool_map = {{0, 0},},
144 		},
145 	},
146 };
147 
148 
149 static unsigned lcore_ids[RTE_MAX_LCORE];
150 static uint16_t ports[RTE_MAX_ETHPORTS];
151 static unsigned num_ports = 0; /**< The number of ports specified in command line */
152 static uint16_t num_pf_queues, num_vmdq_queues;
153 static uint16_t vmdq_pool_base, vmdq_queue_base;
154 static uint16_t queues_per_pool;
155 
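/* One VLAN tag per potential device, indexed by vid; the 64 tags (1000-1063) match MAX_DEVICES. */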
156 const uint16_t vlan_tags[] = {
157 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
158 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
159 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
160 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
161 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
162 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
163 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
164 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
165 };
166 
167 /* ethernet addresses of ports */
168 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
169 
170 static struct vhost_dev_tailq_list vhost_dev_list =
171 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
172 
173 static struct lcore_info lcore_info[RTE_MAX_LCORE];
174 
175 /* Used for queueing bursts of TX packets. */
176 struct mbuf_table {
177 	unsigned len;
178 	unsigned txq_id;
179 	struct rte_mbuf *m_table[MAX_PKT_BURST];
180 };
181 
182 /* TX queue for each data core. */
183 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
184 
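/* TX drain period in TSC cycles: cycles per microsecond (rounded up) times BURST_TX_DRAIN_US. */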
185 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
186 				 / US_PER_S * BURST_TX_DRAIN_US)
187 #define VLAN_HLEN       4
188 
189 /*
190  * Builds up the correct configuration for VMDQ VLAN pool map
191  * according to the pool & queue limits.
192  */
193 static inline int
194 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
195 {
196 	struct rte_eth_vmdq_rx_conf conf;
197 	struct rte_eth_vmdq_rx_conf *def_conf =
198 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
199 	unsigned i;
200 
201 	memset(&conf, 0, sizeof(conf));
202 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
203 	conf.nb_pool_maps = num_devices;
204 	conf.enable_loop_back = def_conf->enable_loop_back;
205 	conf.rx_mode = def_conf->rx_mode;
206 
207 	for (i = 0; i < conf.nb_pool_maps; i++) {
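	/* Map the i-th VLAN tag to VMDQ pool i; 'pools' is a bitmask with one bit per pool. */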
208 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
209 		conf.pool_map[i].pools = (1UL << i);
210 	}
211 
212 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
213 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
214 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
215 	return 0;
216 }
217 
218 /*
219  * Validate the device number against the max pool number obtained from
220  * dev_info. If the device number is invalid, print an error message and
221  * return -1. Each device must have its own pool.
222  */
223 static inline int
224 validate_num_devices(uint32_t max_nb_devices)
225 {
226 	if (num_devices > max_nb_devices) {
227 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
228 		return -1;
229 	}
230 	return 0;
231 }
232 
233 /*
234  * Initialises a given port using global settings, with the RX buffers
235  * coming from the global mbuf_pool.
236  */
237 static inline int
238 port_init(uint16_t port)
239 {
240 	struct rte_eth_dev_info dev_info;
241 	struct rte_eth_conf port_conf;
242 	struct rte_eth_rxconf *rxconf;
243 	struct rte_eth_txconf *txconf;
244 	int16_t rx_rings, tx_rings;
245 	uint16_t rx_ring_size, tx_ring_size;
246 	int retval;
247 	uint16_t q;
248 
249 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
250 	rte_eth_dev_info_get (port, &dev_info);
251 
252 	rxconf = &dev_info.default_rxconf;
253 	txconf = &dev_info.default_txconf;
254 	rxconf->rx_drop_en = 1;
255 
256 	/*configure the number of supported virtio devices based on VMDQ limits */
257 	num_devices = dev_info.max_vmdq_pools;
258 
259 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
260 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
261 
262 	/*
263 	 * When dequeue zero copy is enabled, guest Tx used vring will be
264 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
265 	 * (tx_ring_size here) must be small enough so that the driver will
266 	 * hit the free threshold easily and free mbufs timely. Otherwise,
267 	 * guest Tx vring would be starved.
268 	 */
269 	if (dequeue_zero_copy)
270 		tx_ring_size = 64;
271 
272 	tx_rings = (uint16_t)rte_lcore_count();
273 
274 	retval = validate_num_devices(MAX_DEVICES);
275 	if (retval < 0)
276 		return retval;
277 
278 	/* Get port configuration. */
279 	retval = get_eth_conf(&port_conf, num_devices);
280 	if (retval < 0)
281 		return retval;
282 	/* NIC queues are divided into pf queues and vmdq queues.  */
283 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
284 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
285 	num_vmdq_queues = num_devices * queues_per_pool;
286 	num_queues = num_pf_queues + num_vmdq_queues;
287 	vmdq_queue_base = dev_info.vmdq_queue_base;
288 	vmdq_pool_base  = dev_info.vmdq_pool_base;
289 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
290 		num_pf_queues, num_devices, queues_per_pool);
291 
292 	if (!rte_eth_dev_is_valid_port(port))
293 		return -1;
294 
295 	rx_rings = (uint16_t)dev_info.max_rx_queues;
296 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
297 		port_conf.txmode.offloads |=
298 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
299 	/* Configure ethernet device. */
300 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
301 	if (retval != 0) {
302 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
303 			port, strerror(-retval));
304 		return retval;
305 	}
306 
307 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
308 		&tx_ring_size);
309 	if (retval != 0) {
310 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
311 			"for port %u: %s.\n", port, strerror(-retval));
312 		return retval;
313 	}
314 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
315 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
316 			"for Rx queues on port %u.\n", port);
317 		return -1;
318 	}
319 
320 	/* Setup the queues. */
321 	rxconf->offloads = port_conf.rxmode.offloads;
322 	for (q = 0; q < rx_rings; q ++) {
323 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
324 						rte_eth_dev_socket_id(port),
325 						rxconf,
326 						mbuf_pool);
327 		if (retval < 0) {
328 			RTE_LOG(ERR, VHOST_PORT,
329 				"Failed to setup rx queue %u of port %u: %s.\n",
330 				q, port, strerror(-retval));
331 			return retval;
332 		}
333 	}
334 	txconf->offloads = port_conf.txmode.offloads;
335 	for (q = 0; q < tx_rings; q ++) {
336 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
337 						rte_eth_dev_socket_id(port),
338 						txconf);
339 		if (retval < 0) {
340 			RTE_LOG(ERR, VHOST_PORT,
341 				"Failed to setup tx queue %u of port %u: %s.\n",
342 				q, port, strerror(-retval));
343 			return retval;
344 		}
345 	}
346 
347 	/* Start the device. */
348 	retval  = rte_eth_dev_start(port);
349 	if (retval < 0) {
350 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
351 			port, strerror(-retval));
352 		return retval;
353 	}
354 
355 	if (promiscuous)
356 		rte_eth_promiscuous_enable(port);
357 
358 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
359 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
360 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
361 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
362 			port,
363 			vmdq_ports_eth_addr[port].addr_bytes[0],
364 			vmdq_ports_eth_addr[port].addr_bytes[1],
365 			vmdq_ports_eth_addr[port].addr_bytes[2],
366 			vmdq_ports_eth_addr[port].addr_bytes[3],
367 			vmdq_ports_eth_addr[port].addr_bytes[4],
368 			vmdq_ports_eth_addr[port].addr_bytes[5]);
369 
370 	return 0;
371 }
372 
373 /*
374  * Set socket file path.
375  */
376 static int
377 us_vhost_parse_socket_path(const char *q_arg)
378 {
379 	/* Reject socket paths that do not fit in a PATH_MAX-sized buffer. */
380 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
381 		return -1;
382 
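	/* Grow the flat array of PATH_MAX-sized slots by one and append the new path. */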
383 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
384 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
385 	nb_sockets++;
386 
387 	return 0;
388 }
389 
390 /*
391  * Parse the portmask provided at run time.
392  */
393 static int
394 parse_portmask(const char *portmask)
395 {
396 	char *end = NULL;
397 	unsigned long pm;
398 
399 	errno = 0;
400 
401 	/* parse hexadecimal string */
402 	pm = strtoul(portmask, &end, 16);
403 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
404 		return -1;
405 
406 	if (pm == 0)
407 		return -1;
408 
409 	return pm;
410 
411 }
412 
413 /*
414  * Parse num options at run time.
415  */
416 static int
417 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
418 {
419 	char *end = NULL;
420 	unsigned long num;
421 
422 	errno = 0;
423 
424 	/* parse unsigned int string */
425 	num = strtoul(q_arg, &end, 10);
426 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
427 		return -1;
428 
429 	if (num > max_valid_value)
430 		return -1;
431 
432 	return num;
433 
434 }
435 
436 /*
437  * Display usage
438  */
439 static void
440 us_vhost_usage(const char *prgname)
441 {
442 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
443 	"		--vm2vm [0|1|2]\n"
444 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
445 	"		--socket-file <path>\n"
446 	"		--nb-devices ND\n"
447 	"		-p PORTMASK: Set mask for ports to be used by application\n"
448 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
449 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
450 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
451 	"		--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
452 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
453 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
454 	"		--socket-file: The path of the socket file.\n"
455 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
456 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
457 	"		--client register a vhost-user socket as client mode.\n"
458 	"		--dequeue-zero-copy enables dequeue zero copy\n",
459 	       prgname);
460 }
461 
462 /*
463  * Parse the arguments given in the command line of the application.
464  */
465 static int
466 us_vhost_parse_args(int argc, char **argv)
467 {
468 	int opt, ret;
469 	int option_index;
470 	unsigned i;
471 	const char *prgname = argv[0];
472 	static struct option long_option[] = {
473 		{"vm2vm", required_argument, NULL, 0},
474 		{"rx-retry", required_argument, NULL, 0},
475 		{"rx-retry-delay", required_argument, NULL, 0},
476 		{"rx-retry-num", required_argument, NULL, 0},
477 		{"mergeable", required_argument, NULL, 0},
478 		{"stats", required_argument, NULL, 0},
479 		{"socket-file", required_argument, NULL, 0},
480 		{"tx-csum", required_argument, NULL, 0},
481 		{"tso", required_argument, NULL, 0},
482 		{"client", no_argument, &client_mode, 1},
483 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
484 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
485 		{NULL, 0, 0, 0},
486 	};
487 
488 	/* Parse command line */
489 	while ((opt = getopt_long(argc, argv, "p:P",
490 			long_option, &option_index)) != EOF) {
491 		switch (opt) {
492 		/* Portmask */
493 		case 'p':
494 			enabled_port_mask = parse_portmask(optarg);
495 			if (enabled_port_mask == 0) {
496 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
497 				us_vhost_usage(prgname);
498 				return -1;
499 			}
500 			break;
501 
502 		case 'P':
503 			promiscuous = 1;
504 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
505 				ETH_VMDQ_ACCEPT_BROADCAST |
506 				ETH_VMDQ_ACCEPT_MULTICAST;
507 
508 			break;
509 
510 		case 0:
511 			/* Enable/disable vm2vm comms. */
512 			if (!strncmp(long_option[option_index].name, "vm2vm",
513 				MAX_LONG_OPT_SZ)) {
514 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
515 				if (ret == -1) {
516 					RTE_LOG(INFO, VHOST_CONFIG,
517 						"Invalid argument for "
518 						"vm2vm [0|1|2]\n");
519 					us_vhost_usage(prgname);
520 					return -1;
521 				} else {
522 					vm2vm_mode = (vm2vm_type)ret;
523 				}
524 			}
525 
526 			/* Enable/disable retries on RX. */
527 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
528 				ret = parse_num_opt(optarg, 1);
529 				if (ret == -1) {
530 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
531 					us_vhost_usage(prgname);
532 					return -1;
533 				} else {
534 					enable_retry = ret;
535 				}
536 			}
537 
538 			/* Enable/disable TX checksum offload. */
539 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
540 				ret = parse_num_opt(optarg, 1);
541 				if (ret == -1) {
542 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
543 					us_vhost_usage(prgname);
544 					return -1;
545 				} else
546 					enable_tx_csum = ret;
547 			}
548 
549 			/* Enable/disable TSO offload. */
550 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
551 				ret = parse_num_opt(optarg, 1);
552 				if (ret == -1) {
553 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
554 					us_vhost_usage(prgname);
555 					return -1;
556 				} else
557 					enable_tso = ret;
558 			}
559 
560 			/* Specify the retry delay time (in microseconds) on RX. */
561 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
562 				ret = parse_num_opt(optarg, INT32_MAX);
563 				if (ret == -1) {
564 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
565 					us_vhost_usage(prgname);
566 					return -1;
567 				} else {
568 					burst_rx_delay_time = ret;
569 				}
570 			}
571 
572 			/* Specify the retries number on RX. */
573 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
574 				ret = parse_num_opt(optarg, INT32_MAX);
575 				if (ret == -1) {
576 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
577 					us_vhost_usage(prgname);
578 					return -1;
579 				} else {
580 					burst_rx_retry_num = ret;
581 				}
582 			}
583 
584 			/* Enable/disable RX mergeable buffers. */
585 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
586 				ret = parse_num_opt(optarg, 1);
587 				if (ret == -1) {
588 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
589 					us_vhost_usage(prgname);
590 					return -1;
591 				} else {
592 					mergeable = !!ret;
593 					if (ret) {
594 						vmdq_conf_default.rxmode.offloads |=
595 							DEV_RX_OFFLOAD_JUMBO_FRAME;
596 						vmdq_conf_default.rxmode.max_rx_pkt_len
597 							= JUMBO_FRAME_MAX_SIZE;
598 					}
599 				}
600 			}
601 
602 			/* Enable/disable stats. */
603 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
604 				ret = parse_num_opt(optarg, INT32_MAX);
605 				if (ret == -1) {
606 					RTE_LOG(INFO, VHOST_CONFIG,
607 						"Invalid argument for stats [0..N]\n");
608 					us_vhost_usage(prgname);
609 					return -1;
610 				} else {
611 					enable_stats = ret;
612 				}
613 			}
614 
615 			/* Set socket file path. */
616 			if (!strncmp(long_option[option_index].name,
617 						"socket-file", MAX_LONG_OPT_SZ)) {
618 				if (us_vhost_parse_socket_path(optarg) == -1) {
619 					RTE_LOG(INFO, VHOST_CONFIG,
620 					"Invalid argument for socket name (Max %d characters)\n",
621 					PATH_MAX);
622 					us_vhost_usage(prgname);
623 					return -1;
624 				}
625 			}
626 
627 			break;
628 
629 			/* Invalid option - print options. */
630 		default:
631 			us_vhost_usage(prgname);
632 			return -1;
633 		}
634 	}
635 
636 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
637 		if (enabled_port_mask & (1 << i))
638 			ports[num_ports++] = i;
639 	}
640 
641 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
642 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
643 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
644 		return -1;
645 	}
646 
647 	return 0;
648 }
649 
650 /*
651  * Update the global variable num_ports and the ports[] array according to the
652  * number of system ports, and return the number of valid ports.
653  */
654 static unsigned check_ports_num(unsigned nb_ports)
655 {
656 	unsigned valid_num_ports = num_ports;
657 	unsigned portid;
658 
659 	if (num_ports > nb_ports) {
660 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
661 			num_ports, nb_ports);
662 		num_ports = nb_ports;
663 	}
664 
665 	for (portid = 0; portid < num_ports; portid ++) {
666 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
667 			RTE_LOG(INFO, VHOST_PORT,
668 				"\nSpecified port ID(%u) is not valid\n",
669 				ports[portid]);
670 			ports[portid] = INVALID_PORT_ID;
671 			valid_num_ports--;
672 		}
673 	}
674 	return valid_num_ports;
675 }
676 
677 static __rte_always_inline struct vhost_dev *
678 find_vhost_dev(struct ether_addr *mac)
679 {
680 	struct vhost_dev *vdev;
681 
682 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
683 		if (vdev->ready == DEVICE_RX &&
684 		    is_same_ether_addr(mac, &vdev->mac_address))
685 			return vdev;
686 	}
687 
688 	return NULL;
689 }
690 
691 /*
692  * This function learns the MAC address of the device and registers it, along
693  * with a VLAN tag, with the device's VMDQ pool.
694  */
695 static int
696 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
697 {
698 	struct ether_hdr *pkt_hdr;
699 	int i, ret;
700 
701 	/* Learn MAC address of guest device from packet */
702 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
703 
704 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
705 		RTE_LOG(ERR, VHOST_DATA,
706 			"(%d) device is using a registered MAC!\n",
707 			vdev->vid);
708 		return -1;
709 	}
710 
711 	for (i = 0; i < ETHER_ADDR_LEN; i++)
712 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
713 
714 	/* vlan_tag currently uses the device_id. */
715 	vdev->vlan_tag = vlan_tags[vdev->vid];
716 
717 	/* Print out VMDQ registration info. */
718 	RTE_LOG(INFO, VHOST_DATA,
719 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
720 		vdev->vid,
721 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
722 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
723 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
724 		vdev->vlan_tag);
725 
726 	/* Register the MAC address. */
727 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
728 				(uint32_t)vdev->vid + vmdq_pool_base);
729 	if (ret)
730 		RTE_LOG(ERR, VHOST_DATA,
731 			"(%d) failed to add device MAC address to VMDQ\n",
732 			vdev->vid);
733 
734 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
735 
736 	/* Set device as ready for RX. */
737 	vdev->ready = DEVICE_RX;
738 
739 	return 0;
740 }
741 
742 /*
743  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
744  * queue before disabling RX on the device.
745  */
746 static inline void
747 unlink_vmdq(struct vhost_dev *vdev)
748 {
749 	unsigned i = 0;
750 	unsigned rx_count;
751 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
752 
753 	if (vdev->ready == DEVICE_RX) {
754 		/*clear MAC and VLAN settings*/
755 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
756 		for (i = 0; i < 6; i++)
757 			vdev->mac_address.addr_bytes[i] = 0;
758 
759 		vdev->vlan_tag = 0;
760 
761 		/*Clear out the receive buffers*/
762 		rx_count = rte_eth_rx_burst(ports[0],
763 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
764 
765 		while (rx_count) {
766 			for (i = 0; i < rx_count; i++)
767 				rte_pktmbuf_free(pkts_burst[i]);
768 
769 			rx_count = rte_eth_rx_burst(ports[0],
770 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
771 		}
772 
773 		vdev->ready = DEVICE_MAC_LEARNING;
774 	}
775 }
776 
777 static __rte_always_inline void
778 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
779 	    struct rte_mbuf *m)
780 {
781 	uint16_t ret;
782 
783 	if (builtin_net_driver) {
784 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
785 	} else {
786 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
787 	}
788 
789 	if (enable_stats) {
790 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
791 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
792 		src_vdev->stats.tx_total++;
793 		src_vdev->stats.tx += ret;
794 	}
795 }
796 
797 /*
798  * Check if the packet destination MAC address is for a local device. If so, put
799  * the packet on that device's RX queue. If not, return.
800  */
801 static __rte_always_inline int
802 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
803 {
804 	struct ether_hdr *pkt_hdr;
805 	struct vhost_dev *dst_vdev;
806 
807 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
808 
809 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
810 	if (!dst_vdev)
811 		return -1;
812 
813 	if (vdev->vid == dst_vdev->vid) {
814 		RTE_LOG_DP(DEBUG, VHOST_DATA,
815 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
816 			vdev->vid);
817 		return 0;
818 	}
819 
820 	RTE_LOG_DP(DEBUG, VHOST_DATA,
821 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
822 
823 	if (unlikely(dst_vdev->remove)) {
824 		RTE_LOG_DP(DEBUG, VHOST_DATA,
825 			"(%d) device is marked for removal\n", dst_vdev->vid);
826 		return 0;
827 	}
828 
829 	virtio_xmit(dst_vdev, vdev, m);
830 	return 0;
831 }
832 
833 /*
834  * Check if the destination MAC of a packet belongs to a local VM; if it does,
835  * get its VLAN tag and the length offset.
836  */
837 static __rte_always_inline int
838 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
839 	uint32_t *offset, uint16_t *vlan_tag)
840 {
841 	struct vhost_dev *dst_vdev;
842 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
843 
844 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
845 	if (!dst_vdev)
846 		return 0;
847 
848 	if (vdev->vid == dst_vdev->vid) {
849 		RTE_LOG_DP(DEBUG, VHOST_DATA,
850 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
851 			vdev->vid);
852 		return -1;
853 	}
854 
855 	/*
856 	 * HW VLAN strip reduces the packet length by the length of
857 	 * the VLAN tag, so the packet length needs to be restored
858 	 * by adding it back.
859 	 */
860 	*offset  = VLAN_HLEN;
861 	*vlan_tag = vlan_tags[vdev->vid];
862 
863 	RTE_LOG_DP(DEBUG, VHOST_DATA,
864 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
865 		vdev->vid, dst_vdev->vid, *vlan_tag);
866 
867 	return 0;
868 }
869 
870 static uint16_t
871 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
872 {
873 	if (ol_flags & PKT_TX_IPV4)
874 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
875 	else /* assume ethertype == ETHER_TYPE_IPv6 */
876 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
877 }
878 
879 static void virtio_tx_offload(struct rte_mbuf *m)
880 {
881 	void *l3_hdr;
882 	struct ipv4_hdr *ipv4_hdr = NULL;
883 	struct tcp_hdr *tcp_hdr = NULL;
884 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
885 
886 	l3_hdr = (char *)eth_hdr + m->l2_len;
887 
888 	if (m->ol_flags & PKT_TX_IPV4) {
889 		ipv4_hdr = l3_hdr;
890 		ipv4_hdr->hdr_checksum = 0;
891 		m->ol_flags |= PKT_TX_IP_CKSUM;
892 	}
893 
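	/* For TSO, the hardware expects the TCP checksum field to be seeded with the pseudo-header checksum. */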
894 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
895 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
896 }
897 
898 static inline void
899 free_pkts(struct rte_mbuf **pkts, uint16_t n)
900 {
901 	while (n--)
902 		rte_pktmbuf_free(pkts[n]);
903 }
904 
905 static __rte_always_inline void
906 do_drain_mbuf_table(struct mbuf_table *tx_q)
907 {
908 	uint16_t count;
909 
910 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
911 				 tx_q->m_table, tx_q->len);
912 	if (unlikely(count < tx_q->len))
913 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
914 
915 	tx_q->len = 0;
916 }
917 
918 /*
919  * This function routes the TX packet to the correct interface. This
920  * may be a local device or the physical port.
921  */
922 static __rte_always_inline void
923 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
924 {
925 	struct mbuf_table *tx_q;
926 	unsigned offset = 0;
927 	const uint16_t lcore_id = rte_lcore_id();
928 	struct ether_hdr *nh;
929 
930 
931 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
932 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
933 		struct vhost_dev *vdev2;
934 
935 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
936 			if (vdev2 != vdev)
937 				virtio_xmit(vdev2, vdev, m);
938 		}
939 		goto queue2nic;
940 	}
941 
942 	/*check if destination is local VM*/
943 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
944 		rte_pktmbuf_free(m);
945 		return;
946 	}
947 
948 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
949 		if (unlikely(find_local_dest(vdev, m, &offset,
950 					     &vlan_tag) != 0)) {
951 			rte_pktmbuf_free(m);
952 			return;
953 		}
954 	}
955 
956 	RTE_LOG_DP(DEBUG, VHOST_DATA,
957 		"(%d) TX: MAC address is external\n", vdev->vid);
958 
959 queue2nic:
960 
961 	/*Add packet to the port tx queue*/
962 	tx_q = &lcore_tx_queue[lcore_id];
963 
964 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
965 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
966 		/* Guest has inserted the vlan tag. */
967 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
968 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
969 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
970 			(vh->vlan_tci != vlan_tag_be))
971 			vh->vlan_tci = vlan_tag_be;
972 	} else {
973 		m->ol_flags |= PKT_TX_VLAN_PKT;
974 
975 		/*
976 		 * Find the right seg to adjust the data len when offset is
977 		 * bigger than tail room size.
978 		 */
979 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
980 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
981 				m->data_len += offset;
982 			else {
983 				struct rte_mbuf *seg = m;
984 
985 				while ((seg->next != NULL) &&
986 					(offset > rte_pktmbuf_tailroom(seg)))
987 					seg = seg->next;
988 
989 				seg->data_len += offset;
990 			}
991 			m->pkt_len += offset;
992 		}
993 
994 		m->vlan_tci = vlan_tag;
995 	}
996 
997 	if (m->ol_flags & PKT_TX_TCP_SEG)
998 		virtio_tx_offload(m);
999 
1000 	tx_q->m_table[tx_q->len++] = m;
1001 	if (enable_stats) {
1002 		vdev->stats.tx_total++;
1003 		vdev->stats.tx++;
1004 	}
1005 
1006 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1007 		do_drain_mbuf_table(tx_q);
1008 }
1009 
1010 
1011 static __rte_always_inline void
1012 drain_mbuf_table(struct mbuf_table *tx_q)
1013 {
1014 	static uint64_t prev_tsc;
1015 	uint64_t cur_tsc;
1016 
1017 	if (tx_q->len == 0)
1018 		return;
1019 
1020 	cur_tsc = rte_rdtsc();
1021 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1022 		prev_tsc = cur_tsc;
1023 
1024 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1025 			"TX queue drained after timeout with burst size %u\n",
1026 			tx_q->len);
1027 		do_drain_mbuf_table(tx_q);
1028 	}
1029 }
1030 
1031 static __rte_always_inline void
1032 drain_eth_rx(struct vhost_dev *vdev)
1033 {
1034 	uint16_t rx_count, enqueue_count;
1035 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1036 
1037 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1038 				    pkts, MAX_PKT_BURST);
1039 	if (!rx_count)
1040 		return;
1041 
1042 	/*
1043 	 * When "enable_retry" is set, wait and retry when there are not
1044 	 * enough free slots in the queue to hold @rx_count packets, to
1045 	 * reduce packet loss.
1046 	 */
1047 	if (enable_retry &&
1048 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1049 			VIRTIO_RXQ))) {
1050 		uint32_t retry;
1051 
1052 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1053 			rte_delay_us(burst_rx_delay_time);
1054 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1055 					VIRTIO_RXQ))
1056 				break;
1057 		}
1058 	}
1059 
1060 	if (builtin_net_driver) {
1061 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1062 						pkts, rx_count);
1063 	} else {
1064 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1065 						pkts, rx_count);
1066 	}
1067 	if (enable_stats) {
1068 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1069 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1070 	}
1071 
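	/* The enqueue paths copy packets into the guest's buffers, so all host mbufs can be freed here regardless of how many were enqueued. */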
1072 	free_pkts(pkts, rx_count);
1073 }
1074 
1075 static __rte_always_inline void
1076 drain_virtio_tx(struct vhost_dev *vdev)
1077 {
1078 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1079 	uint16_t count;
1080 	uint16_t i;
1081 
1082 	if (builtin_net_driver) {
1083 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1084 					pkts, MAX_PKT_BURST);
1085 	} else {
1086 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1087 					mbuf_pool, pkts, MAX_PKT_BURST);
1088 	}
1089 
1090 	/* setup VMDq for the first packet */
1091 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1092 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1093 			free_pkts(pkts, count);
1094 	}
1095 
1096 	for (i = 0; i < count; ++i)
1097 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1098 }
1099 
1100 /*
1101  * Main function of vhost-switch. It basically does:
1102  *
1103  * for each vhost device {
1104  *    - drain_eth_rx()
1105  *
1106  *      Which drains the host eth Rx queue linked to the vhost device,
1107  *      and delivers all of them to the guest virtio Rx ring associated with
1108  *      this vhost device.
1109  *
1110  *    - drain_virtio_tx()
1111  *
1112  *      Which drains the guest virtio Tx queue and delivers all of them
1113  *      to the target, which could be another vhost device or the
1114  *      physical eth dev. The routing is done in the function "virtio_tx_route".
1115  * }
1116  */
1117 static int
1118 switch_worker(void *arg __rte_unused)
1119 {
1120 	unsigned i;
1121 	unsigned lcore_id = rte_lcore_id();
1122 	struct vhost_dev *vdev;
1123 	struct mbuf_table *tx_q;
1124 
1125 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1126 
1127 	tx_q = &lcore_tx_queue[lcore_id];
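	/* Each enabled lcore uses its index in lcore_ids[] as the id of the physical TX queue it drains. */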
1128 	for (i = 0; i < rte_lcore_count(); i++) {
1129 		if (lcore_ids[i] == lcore_id) {
1130 			tx_q->txq_id = i;
1131 			break;
1132 		}
1133 	}
1134 
1135 	while(1) {
1136 		drain_mbuf_table(tx_q);
1137 
1138 		/*
1139 		 * If requested, inform the configuration core that we have
1140 		 * finished walking the linked list and that no devices are in use.
1141 		 */
1142 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1143 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1144 
1145 		/*
1146 		 * Process vhost devices
1147 		 */
1148 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1149 			      lcore_vdev_entry) {
1150 			if (unlikely(vdev->remove)) {
1151 				unlink_vmdq(vdev);
1152 				vdev->ready = DEVICE_SAFE_REMOVE;
1153 				continue;
1154 			}
1155 
1156 			if (likely(vdev->ready == DEVICE_RX))
1157 				drain_eth_rx(vdev);
1158 
1159 			if (likely(!vdev->remove))
1160 				drain_virtio_tx(vdev);
1161 		}
1162 	}
1163 
1164 	return 0;
1165 }
1166 
1167 /*
1168  * Remove a device from the specific data core linked list and from the
1169  * main linked list. Synchronization occurs through the use of the
1170  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1171  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1172  */
1173 static void
1174 destroy_device(int vid)
1175 {
1176 	struct vhost_dev *vdev = NULL;
1177 	int lcore;
1178 
1179 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1180 		if (vdev->vid == vid)
1181 			break;
1182 	}
1183 	if (!vdev)
1184 		return;
1185 	/*set the remove flag. */
1186 	vdev->remove = 1;
1187 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1188 		rte_pause();
1189 	}
1190 
1191 	if (builtin_net_driver)
1192 		vs_vhost_net_remove(vdev);
1193 
1194 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1195 		     lcore_vdev_entry);
1196 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1197 
1198 
1199 	/* Set the dev_removal_flag on each lcore. */
1200 	RTE_LCORE_FOREACH_SLAVE(lcore)
1201 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1202 
1203 	/*
1204 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1205 	 * we can be sure that they can no longer access the device removed
1206 	 * from the linked lists and that the devices are no longer in use.
1207 	 */
1208 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1209 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1210 			rte_pause();
1211 	}
1212 
1213 	lcore_info[vdev->coreid].device_num--;
1214 
1215 	RTE_LOG(INFO, VHOST_DATA,
1216 		"(%d) device has been removed from data core\n",
1217 		vdev->vid);
1218 
1219 	rte_free(vdev);
1220 }
1221 
1222 /*
1223  * A new device is added to a data core. First the device is added to the main linked list
1224  * and then allocated to a specific data core.
1225  */
1226 static int
1227 new_device(int vid)
1228 {
1229 	int lcore, core_add = 0;
1230 	uint32_t device_num_min = num_devices;
1231 	struct vhost_dev *vdev;
1232 
1233 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1234 	if (vdev == NULL) {
1235 		RTE_LOG(INFO, VHOST_DATA,
1236 			"(%d) couldn't allocate memory for vhost dev\n",
1237 			vid);
1238 		return -1;
1239 	}
1240 	vdev->vid = vid;
1241 
1242 	if (builtin_net_driver)
1243 		vs_vhost_net_setup(vdev);
1244 
1245 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
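	/* Each device gets its own VMDQ pool; use the first queue of that pool as its RX queue. */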
1246 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1247 
1248 	/*reset ready flag*/
1249 	vdev->ready = DEVICE_MAC_LEARNING;
1250 	vdev->remove = 0;
1251 
1252 	/* Find a suitable lcore to add the device. */
1253 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1254 		if (lcore_info[lcore].device_num < device_num_min) {
1255 			device_num_min = lcore_info[lcore].device_num;
1256 			core_add = lcore;
1257 		}
1258 	}
1259 	vdev->coreid = core_add;
1260 
1261 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1262 			  lcore_vdev_entry);
1263 	lcore_info[vdev->coreid].device_num++;
1264 
1265 	/* Disable notifications. */
1266 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1267 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1268 
1269 	RTE_LOG(INFO, VHOST_DATA,
1270 		"(%d) device has been added to data core %d\n",
1271 		vid, vdev->coreid);
1272 
1273 	return 0;
1274 }
1275 
1276 /*
1277  * These callbacks allow devices to be added to a data core when their
1278  * configuration is fully complete.
1279  */
1280 static const struct vhost_device_ops virtio_net_device_ops =
1281 {
1282 	.new_device =  new_device,
1283 	.destroy_device = destroy_device,
1284 };
1285 
1286 /*
1287  * This thread wakes up periodically to print stats if the user has
1288  * enabled them.
1289  */
1290 static void *
1291 print_stats(__rte_unused void *arg)
1292 {
1293 	struct vhost_dev *vdev;
1294 	uint64_t tx_dropped, rx_dropped;
1295 	uint64_t tx, tx_total, rx, rx_total;
1296 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1297 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1298 
1299 	while(1) {
1300 		sleep(enable_stats);
1301 
1302 		/* Clear screen and move to top left */
1303 		printf("%s%s\n", clr, top_left);
1304 		printf("Device statistics =================================\n");
1305 
1306 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1307 			tx_total   = vdev->stats.tx_total;
1308 			tx         = vdev->stats.tx;
1309 			tx_dropped = tx_total - tx;
1310 
1311 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1312 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1313 			rx_dropped = rx_total - rx;
1314 
1315 			printf("Statistics for device %d\n"
1316 				"-----------------------\n"
1317 				"TX total:              %" PRIu64 "\n"
1318 				"TX dropped:            %" PRIu64 "\n"
1319 				"TX successful:         %" PRIu64 "\n"
1320 				"RX total:              %" PRIu64 "\n"
1321 				"RX dropped:            %" PRIu64 "\n"
1322 				"RX successful:         %" PRIu64 "\n",
1323 				vdev->vid,
1324 				tx_total, tx_dropped, tx,
1325 				rx_total, rx_dropped, rx);
1326 		}
1327 
1328 		printf("===================================================\n");
1329 	}
1330 
1331 	return NULL;
1332 }
1333 
1334 static void
1335 unregister_drivers(int socket_num)
1336 {
1337 	int i, ret;
1338 
1339 	for (i = 0; i < socket_num; i++) {
1340 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1341 		if (ret != 0)
1342 			RTE_LOG(ERR, VHOST_CONFIG,
1343 				"Failed to unregister vhost driver for %s.\n",
1344 				socket_files + i * PATH_MAX);
1345 	}
1346 }
1347 
1348 /* When we receive an INT signal, unregister the vhost driver. */
1349 static void
1350 sigint_handler(__rte_unused int signum)
1351 {
1352 	/* Unregister vhost driver. */
1353 	unregister_drivers(nb_sockets);
1354 
1355 	exit(0);
1356 }
1357 
1358 /*
1359  * While creating an mbuf pool, one key thing is to figure out how
1360  * many mbuf entries are enough for our use. FYI, here are some
1361  * guidelines:
1362  *
1363  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1364  *
1365  * - For each switch core (a CPU core that does the packet switching), we
1366  *   also need to reserve some mbufs for receiving the packets from the
1367  *   virtio Tx queue. How many are enough depends on the usage; it's
1368  *   normally a simple calculation like the following:
1369  *
1370  *       MAX_PKT_BURST * max packet size / mbuf size
1371  *
1372  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1373  *
1374  * - Similarly, for each switch core, we should reserve @nr_rx_desc
1375  *   mbufs for receiving the packets from the physical NIC device.
1376  *
1377  * - We also need to make sure that, for each switch core, we have
1378  *   allocated enough mbufs to fill up the mbuf cache.
1379  */
1380 static void
1381 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1382 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1383 {
1384 	uint32_t nr_mbufs;
1385 	uint32_t nr_mbufs_per_core;
1386 	uint32_t mtu = 1500;
1387 
1388 	if (mergeable)
1389 		mtu = 9000;
1390 	if (enable_tso)
1391 		mtu = 64 * 1024;
1392 
1393 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1394 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1395 	nr_mbufs_per_core += nr_rx_desc;
1396 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
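	/*
	 * Rough example, assuming MAX_PKT_BURST is 32 and the default mbuf
	 * layout (2048 bytes of data room + 128 bytes of headroom), with the
	 * default MTU of 1500: (1500 + 2176) * 32 / 2048 = 57 mbufs per core
	 * for switching, plus nr_rx_desc (1024 by default) for the NIC Rx
	 * queue, i.e. roughly 1081 mbufs per switch core.
	 */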
1397 
1398 	nr_mbufs  = nr_queues * nr_rx_desc;
1399 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1400 	nr_mbufs *= nr_port;
1401 
1402 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1403 					    nr_mbuf_cache, 0, mbuf_size,
1404 					    rte_socket_id());
1405 	if (mbuf_pool == NULL)
1406 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1407 }
1408 
1409 /*
1410  * Main function, does initialisation and calls the per-lcore functions.
1411  */
1412 int
1413 main(int argc, char *argv[])
1414 {
1415 	unsigned lcore_id, core_id = 0;
1416 	unsigned nb_ports, valid_num_ports;
1417 	int ret, i;
1418 	uint16_t portid;
1419 	static pthread_t tid;
1420 	uint64_t flags = 0;
1421 
1422 	signal(SIGINT, sigint_handler);
1423 
1424 	/* init EAL */
1425 	ret = rte_eal_init(argc, argv);
1426 	if (ret < 0)
1427 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1428 	argc -= ret;
1429 	argv += ret;
1430 
1431 	/* parse app arguments */
1432 	ret = us_vhost_parse_args(argc, argv);
1433 	if (ret < 0)
1434 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1435 
1436 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1437 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1438 
1439 		if (rte_lcore_is_enabled(lcore_id))
1440 			lcore_ids[core_id++] = lcore_id;
1441 	}
1442 
1443 	if (rte_lcore_count() > RTE_MAX_LCORE)
1444 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1445 
1446 	/* Get the number of physical ports. */
1447 	nb_ports = rte_eth_dev_count_avail();
1448 
1449 	/*
1450 	 * Update the global variable num_ports and the global array ports[],
1451 	 * and get the number of valid ports according to the system port count.
1452 	 */
1453 	valid_num_ports = check_ports_num(nb_ports);
1454 
1455 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1456 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1457 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1458 		return -1;
1459 	}
1460 
1461 	/*
1462 	 * FIXME: here we are trying to allocate enough mbufs for
1463 	 * @MAX_QUEUES queues, but the truth is we're never going to use that
1464 	 * many queues here. We probably should only do allocation for
1465 	 * those queues we are going to use.
1466 	 */
1467 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1468 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1469 
1470 	if (vm2vm_mode == VM2VM_HARDWARE) {
1471 		/* Enable VT loopback so the NIC's L2 switch can forward VM2VM traffic. */
1472 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1473 		RTE_LOG(DEBUG, VHOST_CONFIG,
1474 			"Enable loop back for L2 switch in vmdq.\n");
1475 	}
1476 
1477 	/* initialize all ports */
1478 	RTE_ETH_FOREACH_DEV(portid) {
1479 		/* skip ports that are not enabled */
1480 		if ((enabled_port_mask & (1 << portid)) == 0) {
1481 			RTE_LOG(INFO, VHOST_PORT,
1482 				"Skipping disabled port %d\n", portid);
1483 			continue;
1484 		}
1485 		if (port_init(portid) != 0)
1486 			rte_exit(EXIT_FAILURE,
1487 				"Cannot initialize network ports\n");
1488 	}
1489 
1490 	/* Enable stats if the user option is set. */
1491 	if (enable_stats) {
1492 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1493 					print_stats, NULL);
1494 		if (ret < 0)
1495 			rte_exit(EXIT_FAILURE,
1496 				"Cannot create print-stats thread\n");
1497 	}
1498 
1499 	/* Launch all data cores. */
1500 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1501 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1502 
1503 	if (client_mode)
1504 		flags |= RTE_VHOST_USER_CLIENT;
1505 
1506 	if (dequeue_zero_copy)
1507 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1508 
1509 	/* Register vhost user driver to handle vhost messages. */
1510 	for (i = 0; i < nb_sockets; i++) {
1511 		char *file = socket_files + i * PATH_MAX;
1512 		ret = rte_vhost_driver_register(file, flags);
1513 		if (ret != 0) {
1514 			unregister_drivers(i);
1515 			rte_exit(EXIT_FAILURE,
1516 				"vhost driver register failure.\n");
1517 		}
1518 
1519 		if (builtin_net_driver)
1520 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1521 
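		/* Trim the features this vhost socket will offer during negotiation, according to the command-line options below. */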
1522 		if (mergeable == 0) {
1523 			rte_vhost_driver_disable_features(file,
1524 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1525 		}
1526 
1527 		if (enable_tx_csum == 0) {
1528 			rte_vhost_driver_disable_features(file,
1529 				1ULL << VIRTIO_NET_F_CSUM);
1530 		}
1531 
1532 		if (enable_tso == 0) {
1533 			rte_vhost_driver_disable_features(file,
1534 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1535 			rte_vhost_driver_disable_features(file,
1536 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1537 			rte_vhost_driver_disable_features(file,
1538 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1539 			rte_vhost_driver_disable_features(file,
1540 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1541 		}
1542 
1543 		if (promiscuous) {
1544 			rte_vhost_driver_enable_features(file,
1545 				1ULL << VIRTIO_NET_F_CTRL_RX);
1546 		}
1547 
1548 		ret = rte_vhost_driver_callback_register(file,
1549 			&virtio_net_device_ops);
1550 		if (ret != 0) {
1551 			rte_exit(EXIT_FAILURE,
1552 				"failed to register vhost driver callbacks.\n");
1553 		}
1554 
1555 		if (rte_vhost_driver_start(file) < 0) {
1556 			rte_exit(EXIT_FAILURE,
1557 				"failed to start vhost driver.\n");
1558 		}
1559 	}
1560 
1561 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1562 		rte_eal_wait_lcore(lcore_id);
1563 
1564 	return 0;
1565 
1566 }
1567